## 'data.frame': 39644 obs. of 62 variables:
## $ url : Factor w/ 39644 levels "http://mashable.com/2013/01/07/amazon-instant-video-browser/",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ timedelta : num 731 731 731 731 731 731 731 731 731 731 ...
## $ n_tokens_title : num 12 9 9 9 13 10 8 12 11 10 ...
## $ n_tokens_content : num 219 255 211 531 1072 ...
## $ n_unique_tokens : num 0.664 0.605 0.575 0.504 0.416 ...
## $ n_non_stop_words : num 1 1 1 1 1 ...
## $ n_non_stop_unique_tokens : num 0.815 0.792 0.664 0.666 0.541 ...
## $ num_hrefs : num 4 3 3 9 19 2 21 20 2 4 ...
## $ num_self_hrefs : num 2 1 1 0 19 2 20 20 0 1 ...
## $ num_imgs : num 1 1 1 1 20 0 20 20 0 1 ...
## $ num_videos : num 0 0 0 0 0 0 0 0 0 1 ...
## $ average_token_length : num 4.68 4.91 4.39 4.4 4.68 ...
## $ num_keywords : num 5 4 6 7 7 9 10 9 7 5 ...
## $ data_channel_is_lifestyle : num 0 0 0 0 0 0 1 0 0 0 ...
## $ data_channel_is_entertainment: num 1 0 0 1 0 0 0 0 0 0 ...
## $ data_channel_is_bus : num 0 1 1 0 0 0 0 0 0 0 ...
## $ data_channel_is_socmed : num 0 0 0 0 0 0 0 0 0 0 ...
## $ data_channel_is_tech : num 0 0 0 0 1 1 0 1 1 0 ...
## $ data_channel_is_world : num 0 0 0 0 0 0 0 0 0 1 ...
## $ kw_min_min : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_max_min : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_avg_min : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_min_max : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_max_max : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_avg_max : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_min_avg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_max_avg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_avg_avg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ self_reference_min_shares : num 496 0 918 0 545 8500 545 545 0 0 ...
## $ self_reference_max_shares : num 496 0 918 0 16000 8500 16000 16000 0 0 ...
## $ self_reference_avg_sharess : num 496 0 918 0 3151 ...
## $ weekday_is_monday : num 1 1 1 1 1 1 1 1 1 1 ...
## $ weekday_is_tuesday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_wednesday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_thursday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_friday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_saturday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_sunday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ is_weekend : num 0 0 0 0 0 0 0 0 0 0 ...
## $ LDA_00 : num 0.5003 0.7998 0.2178 0.0286 0.0286 ...
## $ LDA_01 : num 0.3783 0.05 0.0333 0.4193 0.0288 ...
## $ LDA_02 : num 0.04 0.0501 0.0334 0.4947 0.0286 ...
## $ LDA_03 : num 0.0413 0.0501 0.0333 0.0289 0.0286 ...
## $ LDA_04 : num 0.0401 0.05 0.6822 0.0286 0.8854 ...
## $ global_subjectivity : num 0.522 0.341 0.702 0.43 0.514 ...
## $ global_sentiment_polarity : num 0.0926 0.1489 0.3233 0.1007 0.281 ...
## $ global_rate_positive_words : num 0.0457 0.0431 0.0569 0.0414 0.0746 ...
## $ global_rate_negative_words : num 0.0137 0.01569 0.00948 0.02072 0.01213 ...
## $ rate_positive_words : num 0.769 0.733 0.857 0.667 0.86 ...
## $ rate_negative_words : num 0.231 0.267 0.143 0.333 0.14 ...
## $ avg_positive_polarity : num 0.379 0.287 0.496 0.386 0.411 ...
## $ min_positive_polarity : num 0.1 0.0333 0.1 0.1364 0.0333 ...
## $ max_positive_polarity : num 0.7 0.7 1 0.8 1 0.6 1 1 0.8 0.5 ...
## $ avg_negative_polarity : num -0.35 -0.119 -0.467 -0.37 -0.22 ...
## $ min_negative_polarity : num -0.6 -0.125 -0.8 -0.6 -0.5 -0.4 -0.5 -0.5 -0.125 -0.5 ...
## $ max_negative_polarity : num -0.2 -0.1 -0.133 -0.167 -0.05 ...
## $ title_subjectivity : num 0.5 0 0 0 0.455 ...
## $ title_sentiment_polarity : num -0.188 0 0 0 0.136 ...
## $ abs_title_subjectivity : num 0 0.5 0.5 0.5 0.0455 ...
## $ abs_title_sentiment_polarity : num 0.188 0 0 0 0.136 ...
## $ shares : int 593 711 1500 1200 505 855 556 891 3600 710 ...
## $ Popular : Factor w/ 2 levels "N","Y": 1 1 2 1 1 1 1 1 2 1 ...
## Scan predictor columns (3 .. ncol-1; skips url/timedelta at the front and
## the response at the end) for negative values, print the offenders, and
## record their column indices in clist for later use.
list_cols_cnt <- 1
clist <- array()
for (i in 3:(ncol(super3_alt_ds) - 1)) {
  # Count negative entries once per column (which() drops NAs, so NAs are
  # not counted as negative).
  n_neg <- length(which(super3_alt_ds[, i] < 0))
  if (n_neg > 0) {
    print(paste(i, names(super3_alt_ds)[i], n_neg))
    clist[list_cols_cnt] <- i
    list_cols_cnt <- list_cols_cnt + 1
  }
}
## [1] "20 kw_min_min 22980"
## [1] "22 kw_avg_min 833"
## [1] "26 kw_min_avg 6"
## [1] "46 global_sentiment_polarity 3264"
## [1] "54 avg_negative_polarity 37094"
## [1] "55 min_negative_polarity 37094"
## [1] "56 max_negative_polarity 37094"
## [1] "58 title_sentiment_polarity 5786"
Create a SMOTE-balanced subset to handle the unbalanced response variable
# Balance the 'world' data-channel subset with SMOTE so the Popular response
# is no longer heavily skewed toward "N".
subset_ds <- datachannel_subsets_ds('data_channel_is_world')
# Unbalanced dataset: class counts before resampling
count(subset_ds, Popular)
# NOTE(review): column 45 is dropped before SMOTE — confirm it is the raw
# shares column (it would leak the response otherwise).
smote_world_ds <- SMOTE(Popular ~ ., subset_ds[, -45],
                        perc.over = 120, perc.under = 205)
# Balanced dataset: class counts after resampling
count(smote_world_ds, Popular)
# Next: build a model and make predictions using stepwise regression.
# Candidate predictors for the objective-1 logistic model; backward stepwise
# selection below prunes this list.
objective1_predictors <- c(
  'global_rate_positive_words', 'global_subjectivity',
  'average_token_length', 'num_hrefs', 'num_imgs', 'n_tokens_title',
  'n_unique_tokens', 'LDA_04', 'LDA_02', 'weekday_is_tuesday', 'is_weekend',
  'weekday_is_wednesday', 'n_non_stop_words', 'kw_max_min', 'kw_avg_max',
  'num_self_hrefs', 'kw_avg_avg', 'kw_max_avg'
)
# Reproducible 80/20 train/test split stratified on the response.
set.seed(123)
training.samples <- smote_world_ds$Popular %>%
  createDataPartition(p = 0.8, list = FALSE)
train.data <- smote_world_ds[training.samples, ]
test.data <- smote_world_ds[-training.samples, ]
# NOTE(review): `formula` shadows stats::formula(); kept for compatibility
# with any later chunk that references it by this name.
formula <- as.formula(paste('Popular',
                            paste(objective1_predictors, collapse = " + "),
                            sep = " ~ "))
# Fit the full logistic model, then drop terms by backward stepwise AIC.
model <- glm(formula, data = train.data, family = binomial) %>%
  stepAIC(trace = FALSE, direction = "backward")
# Model summary follows in the output below.
##
## Call:
## glm(formula = Popular ~ global_rate_positive_words + global_subjectivity +
## average_token_length + num_hrefs + num_imgs + n_unique_tokens +
## LDA_02 + weekday_is_tuesday + is_weekend + weekday_is_wednesday +
## n_non_stop_words + kw_avg_max + num_self_hrefs + kw_avg_avg +
## kw_max_avg, family = binomial, data = train.data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7877 -1.0814 -0.7301 1.1395 2.2012
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.980e-01 1.850e-01 1.070 0.284504
## global_rate_positive_words 7.074e+00 1.893e+00 3.736 0.000187 ***
## global_subjectivity 1.534e+00 3.100e-01 4.950 7.44e-07 ***
## average_token_length -7.111e-01 1.016e-01 -7.001 2.55e-12 ***
## num_hrefs 2.179e-02 2.874e-03 7.583 3.38e-14 ***
## num_imgs 4.266e-02 5.356e-03 7.965 1.66e-15 ***
## n_unique_tokens 1.271e+00 2.978e-01 4.270 1.96e-05 ***
## LDA_02 -7.265e-01 1.088e-01 -6.677 2.44e-11 ***
## weekday_is_tuesday -1.914e-01 5.900e-02 -3.244 0.001180 **
## is_weekend 6.864e-01 6.544e-02 10.489 < 2e-16 ***
## weekday_is_wednesday -1.611e-01 5.844e-02 -2.756 0.005845 **
## n_non_stop_words 1.422e+00 5.366e-01 2.650 0.008051 **
## kw_avg_max -2.588e-06 2.869e-07 -9.021 < 2e-16 ***
## num_self_hrefs -2.417e-02 9.362e-03 -2.582 0.009821 **
## kw_avg_avg 5.148e-04 5.390e-05 9.552 < 2e-16 ***
## kw_max_avg -7.005e-05 1.070e-05 -6.546 5.90e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 13186 on 9512 degrees of freedom
## Residual deviance: 12415 on 9497 degrees of freedom
## AIC: 12447
##
## Number of Fisher Scoring iterations: 4
# Predictions and confusion matrix on the held-out test set.
probabilities <- model %>% predict(test.data, type = "response")
# 0.5 probability cutoff maps to the two response classes.
predicted.classes <- ifelse(probabilities > 0.5, "Y", "N")
# Model accuracy
mean(predicted.classes == test.data$Popular)  ## [1] 0.6180059
# Reorder rows/columns so "Y" appears first, making it the positive class.
cm_table <- table(predicted.classes, test.data$Popular)[c(2, 1), c(2, 1)]
## Confusion Matrix
CM <- confusionMatrix(cm_table)
CM  ## Confusion Matrix and Statistics (output below)
##
##
## predicted.classes Y N
## Y 654 388
## N 520 815
##
## Accuracy : 0.618
## 95% CI : (0.5981, 0.6376)
## No Information Rate : 0.5061
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.2349
##
## Mcnemar's Test P-Value : 1.378e-05
##
## Sensitivity : 0.5571
## Specificity : 0.6775
## Pos Pred Value : 0.6276
## Neg Pred Value : 0.6105
## Prevalence : 0.4939
## Detection Rate : 0.2751
## Detection Prevalence : 0.4384
## Balanced Accuracy : 0.6173
##
## 'Positive' Class : Y
##
# Goodness-of-fit test (Hosmer-Lemeshow, g = 10 bins) on the training fit.
# Recode the factor response to 0/1 as hoslem.test() expects.
trainingPopular <- if_else(train.data$Popular == 'Y', 1, 0)
hoslem.test(trainingPopular, fitted(model), g = 10)
##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: trainingPopular, fitted(model)
## X-squared = 29.851, df = 8, p-value = 0.0002246
# Check the linearity-of-the-logit assumption: plot each numeric predictor
# against the fitted log-odds.
# Select only numeric predictors
probabilities <- predict(model, type = "response")
mydata <- train.data %>% dplyr::select_if(is.numeric)
predictors <- colnames(mydata)
# Bind the logit column, then reshape to long format for faceted plotting.
# (Inside filter() below, `predictors` refers to the gathered column, not
# the character vector above.)
mydata <- mydata %>%
  mutate(logit = log(probabilities / (1 - probabilities))) %>%
  gather(key = "predictors", value = "predictor.value", -logit)
# One scatter + linear-trend plot per retained model term (intercept dropped).
model_preds <- names(model$coefficients)[-1]
for (i in seq_along(model_preds)) {
  g <- mydata %>%
    filter(predictors == model_preds[i]) %>%
    ggplot(aes(x = logit, y = predictor.value)) +
    geom_point(size = 0.5, alpha = 0.5) +
    geom_smooth(method = "lm") +
    theme_bw()
  print(g)
}
# Influential points analysis follows.
There are no influential points in this part of the dataset
# Extract per-observation model diagnostics (Cook's distance, standardized
# residuals, etc.) and add a row index for plotting.
model.data <- augment(model) %>%
  mutate(index = 1:n())
# Show the 3 observations with the largest Cook's distance.
model.data %>% top_n(3, .cooksd)
# Plot standardized residuals by index, colored by response class, to spot
# influential points visually.
ggplot(model.data, aes(index, .std.resid)) +
  geom_point(aes(color = Popular), alpha = .5) +
  theme_bw()